import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import silhouette_score
# Load the credit-card customer dataset from the local Excel workbook.
df = pd.read_excel("Credit Card Customer Data.xlsx")
df.head()
#1
df.shape
# Shape and data types of each column
df.info()
# Removing 'Sl_No' column: it is a unique serial number per row and provides no insight for the model
df.drop('Sl_No', axis=1, inplace=True)
df.describe().transpose()
df.nunique()
# There are 655 unique Customer Keys, so there must be 5 duplicate customer keys
# Show all duplicated Customer Keys (keep=False marks every occurrence, not just one of the pair)
df[df.duplicated(subset=['Customer Key'],keep=False)]
# The 5 duplicate customer keys should be removed
# Univariate distribution (KDE only) of every feature, one figure per column.
# The loop body had lost its indentation in this copy; structure restored.
for i in df.columns:
    # NOTE: sns.distplot is deprecated since seaborn 0.11 -- kdeplot/displot
    # are the modern equivalents; kept here to preserve existing output.
    sns.distplot(df[i], hist=False)
    plt.show()
#Looks like there are 3 or 4 clusters. Need to use elbow method to confirm
# Check for outliers using box plots: one subplot per feature on a 3x3 grid.
plt.figure(figsize=(15, 10))
# enumerate replaces the hand-maintained `pos` counter; the loop body had
# lost its indentation in this copy, so the structure is restored here.
for pos, col in enumerate(df.columns, start=1):
    plt.subplot(3, 3, pos)
    sns.boxplot(df[col])
# There are several outliers for Average Credit limit and Total Visits Online
# Pairwise Pearson correlations between the numeric features
corr = df.corr()
sns.heatmap(corr, annot = True)
#Total_calls_made is highly negatively correlated with Total_Credit_Cards
#Total_calls_made is negatively correlated with Avg_Credit_Limit and Total_visits_bank
#Total_visits_online is negatively correlated with Total_visits_bank
#Avg_Credit_Limit is positively correlated with Total_Credit_Cards and Total_visits_online
# Scatter-matrix of all feature pairs, with KDE curves on the diagonal
sns.pairplot(df,diag_kind='kde');
#Looks like we might need 4 clusters based on Total_Credit_Cards. Let's try elbow method to confirm
import pandas_profiling
# Automated EDA report (distributions, missing values, correlations),
# used here to double-check for incorrect imputation.
pandas_profiling.ProfileReport(df)
#2
#There are duplicate Customer key. We need to remove the duplicates
#Based on the graphs we might need 3 or 4 clusters, we can confirm the same using the elbow method later.
#There are several outliers for Avg_Credit_Limit and Total_visits_online
# Show the rows that will be dropped (keep='last' flags the earlier occurrence of each duplicate key)
df[df.duplicated(subset=['Customer Key'],keep='last')]
df.drop_duplicates(subset=['Customer Key'],keep='last', inplace=True)
# To confirm duplicates are dropped -- this should now return an empty frame
df[df.duplicated(subset=['Customer Key'],keep='last')]
## Scale the data using z-score so that no single feature dominates the distance metric
# NOTE(review): 'Customer Key' is still a column here and gets z-scored and
# clustered along with the behavioural features -- confirm that is intended.
df_z = df.apply(zscore)
#3
#### Finding optimal no. of clusters for scaled data (elbow method)
# For each k, fit K-Means and record the mean distance of every point to its
# nearest centroid ("average distortion"); the elbow of this curve suggests k.
# The loop body had lost its indentation in this copy; structure restored.
clusters = range(1, 10)
meanDistortions1 = []
for k in clusters:
    # random_state=1 added so the elbow curve is reproducible, consistent
    # with the seeded KMeans fits used later in this notebook.
    model1 = KMeans(n_clusters=k, random_state=1)
    model1.fit(df_z)
    # (the original also called model1.predict(df_z) but never used the result)
    meanDistortions1.append(
        sum(np.min(cdist(df_z, model1.cluster_centers_, 'euclidean'), axis=1)) / df_z.shape[0])
plt.plot(clusters, meanDistortions1, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
#Based on Elbow Method, 3 or 5 clusters.
#Let's try the value of 3 for K first
kmeans3 = KMeans(n_clusters=3, n_init = 15, random_state=1)
kmeans3.fit(df_z)
k3_prediction=kmeans3.predict(df_z)
#Append the prediction
df["K3_GROUP"] = k3_prediction
# NOTE(review): appending the label column to df_z means every later fit on
# df_z (K=5, hierarchical, silhouette loops) includes K3_GROUP as a feature --
# confirm that is intended.
df_z["K3_GROUP"] = k3_prediction
df.head()
df_z.head(10)
#Analyze the distribution of the data among the three groups (K = 3)
df_groupedk3 = df_z.groupby(['K3_GROUP'])
df_groupedk3.mean()
# Value counts = number of customers assigned to each of the 3 clusters
df_groupedk3['K3_GROUP'].value_counts()
# Per-cluster box plots of every feature (z-scored)
df_z.boxplot(by = 'K3_GROUP', layout=(2,4), figsize=(20, 15))
#Let's try the value of 5 for K as the next elbow is at 5
kmeans5 = KMeans(n_clusters=5, n_init = 15, random_state=1)
kmeans5.fit(df_z)
k5_prediction=kmeans5.predict(df_z)
#Append the k-means prediction for K=5
df["K5_GROUP"] = k5_prediction
df_z["K5_GROUP"] = k5_prediction
df_z.head(10)
# Analyze the distribution of the data among the five groups (K = 5)
df_groupedk5 = df_z.groupby(['K5_GROUP'])
df_groupedk5.mean()
#Perform Value counts for K=5
df_groupedk5['K5_GROUP'].value_counts()
#Box plots for K=5
df_z.boxplot(by = 'K5_GROUP', layout=(2,4), figsize=(40, 30))
#4
# cophenet index is a measure of the correlation between the distance of points in feature space and distance on dendrogram
# closer it is to 1, the better is the clustering
# NOTE(review): df_z now also contains the K3_GROUP/K5_GROUP label columns
# appended above, so linkage() treats them as features -- confirm intended.
Z_ea = linkage(df_z, metric='euclidean', method='average')
c_ea, coph_dists_ea = cophenet(Z_ea, pdist(df_z))
print("cophenet index for Average method with Euclidean distance :", c_ea)
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Hierarchical Clustering Dendrogram')  # typo 'Dendogram' fixed
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(Z_ea, leaf_rotation=90., color_threshold=40, leaf_font_size=8.)
plt.tight_layout()
# Repeat the cophenet comparison with complete (maximum-distance) linkage
Z_ec = linkage(df_z, metric='euclidean', method='complete')
c_ec, coph_dists_ec = cophenet(Z_ec, pdist(df_z))
print("cophenet index for Complete method with Euclidean distance :", c_ec)
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Hierarchical Clustering Dendrogram')  # typo 'Dendogram' fixed
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(Z_ec, leaf_rotation=90., color_threshold=40, leaf_font_size=8.)
plt.tight_layout()
# Repeat the cophenet comparison with Ward linkage (minimises within-cluster variance)
Z_ew = linkage(df_z, metric='euclidean', method='ward')
c_ew, coph_dists_ew = cophenet(Z_ew, pdist(df_z))
print("cophenet index for Ward method with Euclidean distance :", c_ew)
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Hierarchical Clustering Dendrogram')  # typo 'Dendogram' fixed
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(Z_ew, leaf_rotation=90., color_threshold=40, leaf_font_size=8.)
plt.tight_layout()
# Cophenet index for Euclidean distance is maximum for Average method : 0.9185047026907476
# Cluster with the best-scoring linkage (average, per the cophenet comparison above).
# The 'affinity' keyword is dropped: it was deprecated in scikit-learn 1.2 and
# removed in 1.4, and 'euclidean' is the default metric, so behaviour is unchanged.
model_a3 = AgglomerativeClustering(n_clusters=3, linkage='average')
model_a3.fit(df_z)
#Append the Agglo prediction for 3 clusters
df["A3_GROUP"] = model_a3.labels_
df_z["A3_GROUP"] = model_a3.labels_
df_z.head(10)
#Analyze the distribution of the data among the three CLUSTERS
df_groupeda3 = df_z.groupby(['A3_GROUP'])
df_groupeda3.mean()
df_z['A3_GROUP'].value_counts()
#Box plots for 3 clusters
df_z.boxplot(by = 'A3_GROUP', layout=(4,8), figsize=(40, 30))
#5
#Silhouette Score for Hierarchical clustering
# fit_predict() replaces the original fit() followed by a second fit_predict(),
# which fitted every model twice for no benefit. Loop indentation restored.
list_clusters = [2, 3, 4, 5, 6]
Total_score_loop = 0
for i in list_clusters:
    # 'affinity' keyword dropped (deprecated in scikit-learn 1.2, removed in
    # 1.4); 'euclidean' is the default, so results are unchanged.
    model_loop = AgglomerativeClustering(n_clusters=i, linkage='average')
    labels_loop = model_loop.fit_predict(df_z)
    score_loop = silhouette_score(df_z, labels_loop)
    Total_score_loop = Total_score_loop + score_loop
    print('Silhouette Score for Agglo with', i, 'clusters: %.2f' % score_loop)
print('Total Silhouette Score for AgglomerativeClustering : %.2f' % Total_score_loop)
# Divide by the number of cluster settings actually tried, not a hard-coded 5
Avg_score = Total_score_loop / len(list_clusters)
print('Average Silhouette Score for AgglomerativeClustering : %.2f' % Avg_score)
#Silhouette Score for K-means clustering (loop indentation restored)
list_clusters = [2, 3, 4, 5, 6]
Total_score_kloop = 0
for i in list_clusters:
    kmeans_loop = KMeans(n_clusters=i, n_init=15, random_state=1)
    kmeans_loop.fit(df_z)
    # labels_ already holds the training assignments; the original's separate
    # predict() call produced an unused variable and is dropped.
    score_kloop = silhouette_score(df_z, kmeans_loop.labels_, metric='euclidean')
    Total_score_kloop = Total_score_kloop + score_kloop
    print('Silhouette Score for k-Means with', i, 'clusters: %.2f' % score_kloop)
print('Total Silhouette Score for k-Means Clustering : %.2f' % Total_score_kloop)
# Average over however many cluster settings were tried (was a hard-coded 5)
Avg_kscore = Total_score_kloop / len(list_clusters)
print('Average Silhouette Score for k-Means Clustering : %.2f' % Avg_kscore)
#The silhouette ranges from −1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters.
#Based on Silhouette score 4 clusters are good for both K-Means and Hierarchical.
#Using K=4 as it has the highest Silhouette score
# Final K-Means fit with the silhouette-selected K=4.
# n_init fixed from 14 to 15 for consistency with every other KMeans fit in
# this notebook (the 14 looked like a typo).
kmeans4 = KMeans(n_clusters=4, n_init=15, random_state=1)
kmeans4.fit(df_z)
k4_prediction = kmeans4.predict(df_z)
#Append the k-means prediction for K=4
df["K4_GROUP"] = k4_prediction
df_z["K4_GROUP"] = k4_prediction
df_z.head(10)
# Cluster centres in z-score space
k_centroids = kmeans4.cluster_centers_
k_centroids
#Analyze the distribution of the data among the four groups (K = 4)
df_groupedk4 = df_z.groupby(['K4_GROUP'])
df_groupedk4.mean()
#Perform Value counts for K=4
df_groupedk4['K4_GROUP'].value_counts()
#Box plots for K=4
df_z.boxplot(by = 'K4_GROUP', layout=(3,5), figsize=(40, 30))
from mpl_toolkits.mplot3d import Axes3D
## 3D scatter of the first three (z-scored) columns, coloured by K=4 cluster label
fig = plt.figure(figsize=(8, 6))
# add_subplot(projection='3d') replaces direct Axes3D(fig, rect=...) construction,
# which stopped auto-registering the axes on the figure in Matplotlib >= 3.4.
ax = fig.add_subplot(projection='3d')
ax.view_init(elev=20, azim=60)
labels = kmeans4.labels_
# np.float was removed in NumPy 1.24; the builtin float is the documented replacement.
ax.scatter(df_z.iloc[:, 0], df_z.iloc[:, 1], df_z.iloc[:, 2], c=labels.astype(float), edgecolor='k')
# Hide tick labels (the ax.w_?axis accessors were removed in Matplotlib 3.8)
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
# Label the axes with the columns actually plotted; the original
# 'Length'/'Height'/'Weight' labels were left over from a different dataset.
ax.set_xlabel(df_z.columns[0])
ax.set_ylabel(df_z.columns[1])
ax.set_zlabel(df_z.columns[2])
ax.set_title('3D plot of KMeans Clustering')
#AgglomerativeClustering for 4 clusters
# The 'affinity' keyword is dropped (deprecated in scikit-learn 1.2, removed
# in 1.4); 'euclidean' is the default metric, so results are unchanged.
model_a4 = AgglomerativeClustering(n_clusters=4, linkage='average')
model_a4.fit(df_z)
#Append the Agglo prediction for 4 clusters
df["A4_GROUP"] = model_a4.labels_
df_z["A4_GROUP"] = model_a4.labels_
df_z.head(10)
#Analyze the distribution of the data among the four CLUSTERS
df_groupeda4 = df_z.groupby(['A4_GROUP'])
df_groupeda4.mean()
df_z['A4_GROUP'].value_counts()
#Box plots for 4 clusters
df_z.boxplot(by = 'A4_GROUP', layout=(4,8), figsize=(40, 40))
# Condensed dendrogram of the average-linkage tree computed earlier (Z_ea),
# truncated so only the final merges / 4-cluster structure are visible
dendrogram(
    Z_ea,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=4,
)
plt.show()
#6
# Compare the 4-cluster solutions from K-Means and AgglomerativeClustering
# by matching clusters with similar sizes (value counts) and similar means.
df_z['A4_GROUP'].value_counts()
df_z['K4_GROUP'].value_counts()
## Based on the value counts,
#Cluster 0 in AgglomerativeClustering is same as Cluster 2 in K-Means clustering
#Cluster 2 in AgglomerativeClustering is same as Cluster 1 in K-Means clustering
#Cluster 3 in AgglomerativeClustering is same as Cluster 0 in K-Means clustering
#Cluster 1 in AgglomerativeClustering is same as Cluster 3 in K-Means clustering
# The per-cluster means below also confirm that mapping
df_groupeda4.mean()
#7
##Group 0 is customers with medium Avg_Credit_Limit, medium Total_Credit_Cards,
#highest Total_visits_bank, low Total_visits_online and low Total_calls_made
##Group 1 is customers with highest Avg_Credit_Limit, highest Total_Credit_Cards,
#least Total_visits_bank, highest Total_visits_online and least Total_calls_made
##Group 2 is customers with least Avg_Credit_Limit, least Total_Credit_Cards,
#low Total_visits_bank, medium Total_visits_online and highest Total_calls_made
##Group 3 is customers with low Avg_Credit_Limit, low Total_Credit_Cards,
#low Total_visits_bank, medium Total_visits_online and medium Total_calls_made
#Key Questions:
##How many different segments of customers are there?
#There are 4 different segments of customers
##How are these segments different from each other?
#As already mentioned above
##Group 0 is customers with medium Avg_Credit_Limit, medium Total_Credit_Cards,
#high Total_visits_bank, low Total_visits_online and low Total_calls_made
##Group 1 is customers with highest Avg_Credit_Limit, highest Total_Credit_Cards,
#least Total_visits_bank, highest Total_visits_online and least Total_calls_made
##Group 2 is customers with least Avg_Credit_Limit, least Total_Credit_Cards,
#low Total_visits_bank, medium Total_visits_online and highest Total_calls_made
##Group 3 is customers with low Avg_Credit_Limit, low Total_Credit_Cards,
#low Total_visits_bank, medium Total_visits_online and medium Total_calls_made
##What are your recommendations to the bank on how to better market to and service these customers?
#The number of customers is highest in Group 0, i.e., customers with medium Avg_Credit_Limit,
#medium Total_Credit_Cards, high Total_visits_bank, low Total_visits_online and low Total_calls_made
#The bank can concentrate more on these customers as they are the largest segment
##Customers with highest Total_Credit_Cards are using more online banking, so online marketing
#will work best for these customers
##Customers with the least Avg_Credit_Limit and least Total_Credit_Cards are making more
#calls to the bank, so these customers can be reached by phone, and phone marketing
#will work best for these customers